##############################################
# WE CURRENTLY HAVE THREE SOURCES OF DOCUMENTS
# 1) 20NEWS
# 2) REUTERS
# 3) LUIS' NEWS FEED
#############################################

# Root of the project tree; every data path below is derived from it.
# $(HOME) is used instead of a literal "~": a tilde stored in a variable is
# handed to the shell untouched, and the shell only expands it when it is
# unquoted and at the very start of a word.
PROJECT := $(HOME)/text-categorization-som/text
# Script run over the extracted corpus directories (path is relative to the
# directory make is invoked from).
RENAME := ./rename.sh
# Directory holding the downloaded archives and the extracted corpora.
SOURCES_DIR := $(PROJECT)/data/sources
# Script executed by the getnewsfeed rule (path is relative to the directory
# make is invoked from).
NEWSFEED := ./newsfeedrecollector.py

# 20NEWS Variables
# Base URL the 20 Newsgroups archive is fetched from.
WEB_SOURCE := http://qwone.com/~jason/20Newsgroups
# Local archive path and the test/train directory paths used by the rules.
TARBALL := $(SOURCES_DIR)/20news-bydate.tar.gz
TEST_DIR := $(SOURCES_DIR)/20news-bydate-test
TRAIN_DIR := $(SOURCES_DIR)/20news-bydate-train
# Newsgroup names; each one is a sub-directory of both TRAIN_DIR and TEST_DIR.
NEWS20_GROUPS := \
    alt.atheism \
    comp.graphics \
    comp.os.ms-windows.misc \
    comp.sys.ibm.pc.hardware \
    comp.sys.mac.hardware \
    comp.windows.x \
    misc.forsale \
    rec.autos \
    rec.motorcycles \
    rec.sport.baseball \
    rec.sport.hockey \
    sci.crypt \
    sci.electronics \
    sci.med \
    sci.space \
    soc.religion.christian \
    talk.politics.guns \
    talk.politics.mideast \
    talk.politics.misc \
    talk.religion.misc
# All train directories followed by all test directories, in list order.
DIRECTORIES := $(addprefix $(TRAIN_DIR)/,$(NEWS20_GROUPS)) $(addprefix $(TEST_DIR)/,$(NEWS20_GROUPS))


# REUTERS Variables
# Path of the Reuters zip archive and of the directory the corpus lives in.
REUTERS_ZIP := $(SOURCES_DIR)/reuters.zip
REUTERS_DIR := $(SOURCES_DIR)/reuters
# Sub-directories of REUTERS_DIR that the getreuters rule passes to $(RENAME).
REUTERS_DIRECTORIES := $(addprefix $(REUTERS_DIR)/,test training)

# Default goal: run the rules for all three document sources.
# Declared phony because "all" names an action, not a file make should look
# for on disk.
.PHONY: all
all: get20news getnewsfeed getreuters

########################
# RULE: get20news
# DESCRIPTION:
#      Download the 20News tarball into SOURCES_DIR, extract it there
#      and run the rename script over the extracted directories.
#      The URL is built from the tarball's file name only (notdir),
#      because TARBALL holds a full local path. The download is written
#      to TARBALL (-O) and extracted inside SOURCES_DIR (-C) so that the
#      paths listed in DIRECTORIES exist afterwards.
#      Assumes the archive unpacks to the directories named by TRAIN_DIR
#      and TEST_DIR -- TODO confirm against the archive contents.
# INPUT:
#     WEB_SOURCE    -  Base URL the tarball is downloaded from
#     TARBALL       -  Local path the tarball is saved to
#     DIRECTORIES   -  List of directories passed to $(RENAME)
# OUTPUT:
#     None
########################
# Phony: the rule names an action; a stray file called "get20news" that is
# newer than $(RENAME) would otherwise make the rule look up to date.
.PHONY: get20news
get20news: $(RENAME)
	mkdir -p $(SOURCES_DIR)
	wget -O $(TARBALL) $(WEB_SOURCE)/$(notdir $(TARBALL))
	tar xvf $(TARBALL) -C $(SOURCES_DIR)
	$(RENAME) $(DIRECTORIES)



########################
# RULE: getnewsfeed
# DESCRIPTION:
#      Run the news feed collector script ($(NEWSFEED)).
#      What the script downloads and where it stores it is decided by
#      the script itself, not by this rule.
# INPUT:
#     NEWSFEED  -  Path of the script to execute
# OUTPUT:
#     None
########################
# Phony: the rule names an action; a stray file called "getnewsfeed" that is
# newer than $(NEWSFEED) would otherwise make the rule look up to date.
.PHONY: getnewsfeed
getnewsfeed: $(NEWSFEED)
	$(NEWSFEED)


########################
# RULE: getreuters
# DESCRIPTION:
#      Extract the Reuters corpus zip into SOURCES_DIR and run the
#      rename script over its directories.
#      The archive is unpacked with -d $(SOURCES_DIR) so the result
#      lands where REUTERS_DIRECTORIES point, whatever directory make is
#      invoked from; -o overwrites files from an earlier run without
#      prompting.
#      Assumes the zip has a top-level "reuters" directory containing
#      "test" and "training" -- TODO confirm against the archive.
#      REUTERS_ZIP must already be present; no rule visible in this file
#      downloads it.
# INPUT:
#     REUTERS_ZIP  - Zip containing the reuters documents
#     REUTERS_DIRECTORIES - Directories passed to $(RENAME)
# OUTPUT:
#     None
########################

# Phony: the rule names an action; a stray file called "getreuters" that is
# newer than $(RENAME) would otherwise make the rule look up to date.
.PHONY: getreuters
getreuters: $(RENAME)
	unzip -o $(REUTERS_ZIP) -d $(SOURCES_DIR)
	$(RENAME) $(REUTERS_DIRECTORIES)


# Remove the 20News tarball, the extracted 20News test/train trees, and the
# Reuters zip and directory.
# Every rm uses -f so that clean still succeeds, and still removes the
# remaining items, when some of them were never created.
# NOTE(review): REUTERS_ZIP is deleted here although no rule visible in this
# file recreates it -- confirm that is intended.
.PHONY: clean
clean:
	rm -f $(TARBALL)
	rm -rf $(TEST_DIR) $(TRAIN_DIR)
	rm -rf $(REUTERS_ZIP) $(REUTERS_DIR)
